In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
First, a correction: I misnamed the data set — it contains 2015 data, not 2014 data. Still use
311-2014.csv
; you can rename it.
1.Import your data, but only the first 200,000 rows. You'll also want to change the index to be a datetime based on the Created Date column - you'll want to check if it's already a datetime, and parse it if not.
In [2]:
# Per the assignment, load only the first 200,000 rows. nrows caps the read at
# the source, so the full file never has to fit in memory (the original load
# hit memory errors and fell back to a pre-trimmed file).
df = pd.read_csv("small-311-2015.csv", nrows=200000)
df.head(5)
Out[2]:
In [3]:
# All column names available in the 311 extract.
df.columns.values
Out[3]:
In [4]:
# Sanity check: dateutil handles the US-style timestamp format used by
# Created Date (month first, 12-hour clock with AM/PM).
dateutil.parser.parse("07/04/2015 03:33:09 AM")
Out[4]:
In [5]:
# Dtypes and non-null counts; the parsing step below implies Created Date
# arrives as plain strings here.
df.info()
In [6]:
def parse_date(str_date):
    """Parse a single '07/04/2015 03:33:09 AM'-style string (kept for compatibility)."""
    return dateutil.parser.parse(str_date)

# The assignment asks to check whether Created Date is already a datetime and
# parse it only if not. pd.to_datetime with an explicit format is far faster
# than a row-wise dateutil .apply and yields the same timestamps.
if pd.api.types.is_datetime64_any_dtype(df['Created Date']):
    df['created_datetime'] = df['Created Date']
else:
    df['created_datetime'] = pd.to_datetime(df['Created Date'],
                                            format='%m/%d/%Y %I:%M:%S %p')
df.head(2)
Out[6]:
In [7]:
# Re-index the frame by the parsed timestamps so datetime slicing and
# resampling work in the cells below.
df.index = df['created_datetime']
In [8]:
df.head(5)
Out[8]:
2.What was the most popular type of complaint, and how many times was it filed?
In [9]:
df['Complaint Type'].value_counts().head(5)
Out[9]:
3.Make a horizontal bar graph of the top 5 most frequent complaint types.
In [10]:
df['Complaint Type'].value_counts().head(5).plot(kind='barh', y='Complaint Type')
Out[10]:
4.Which borough has the most complaints per capita? Since it's only 5 boroughs, you can do the math manually.
In [11]:
# Q4: work with just the borough and complaint-type columns.
columns_to_show = ['Park Borough', 'Complaint Type']
complaints = df[columns_to_show]
complaints.head()
Out[11]:
In [12]:
# Complaint count per borough, as a one-column DataFrame indexed by borough name.
new_list = pd.DataFrame(complaints['Park Borough'].value_counts())
new_list.head(6)
Out[12]:
In [13]:
# Attach each row's borough total by merging on the borough name; the
# overlapping 'Park Borough' column is suffixed _x (row value) / _y (count).
new_comp = complaints.merge(new_list, left_on='Park Borough', right_index=True)
new_comp.head(3)
Out[13]:
In [14]:
# Friendlier alias for the per-borough total.
new_comp['count'] = new_comp['Park Borough_y']
In [15]:
new_comp.head()
Out[15]:
In [16]:
# Drop the suffixed merge columns now that 'count' carries the total.
new_complaints = new_comp.drop(['Park Borough_x','Park Borough_y'], axis=1)
In [17]:
new_complaints.head()
Out[17]:
In [18]:
new_list.head(6)
Out[18]:
In [19]:
# Borough populations: Manhattan 1,636,268; Bronx 1,438,159; Brooklyn
# 2,621,793; Queens 2,321,580; Staten Island 472,621.
# 'Unspecified' gets NaN rather than 0 so the per-capita division later
# yields a missing value instead of a misleading division-by-zero inf.
per_capita = {'BROOKLYN': 2621793, 'QUEENS': 2321580, 'MANHATTAN': 1636268,
              'BRONX': 1438159, 'STATEN ISLAND': 472621, 'Unspecified': np.nan}
my_list = pd.DataFrame.from_dict(per_capita, orient='index')
my_list.head()
Out[19]:
In [20]:
# Name the single population column.
my_list.columns = ['population']
In [21]:
my_list
Out[21]:
In [22]:
# Complaint counts and populations side by side, joined on the borough index.
per_cap = new_list.merge(my_list, left_index=True, right_index=True)
per_cap.head(6)
Out[22]:
In [23]:
# Alias the count column. The drop() below is NOT assigned back, so
# 'Park Borough' is still present for the division in the next cell;
# the dropped view is display-only.
per_cap['count'] = per_cap['Park Borough']
per_cap.drop('Park Borough', axis=1)
Out[23]:
In [24]:
# Finally: complaints per borough resident. Sort the whole table *before*
# taking the top rows — the original sorted only the first six rows it
# happened to grab with head(6). (Column name fixed from the misspelled
# 'per_capital'.)
per_cap['per_capita'] = per_cap['Park Borough'] / per_cap['population']
per_cap.sort_values(by='per_capita', ascending=False).head(6)
Out[24]:
6.According to your selection of data, how many cases were filed in March? How about May?
In [25]:
df.head(3)
Out[25]:
In [26]:
march = df['2015-03']
In [27]:
# Per-type counts for March. Series.value_counts (descending by default)
# replaces the deprecated top-level pd.value_counts.
total_march = march['Complaint Type'].value_counts()
total_march
Out[27]:
In [28]:
print("The total number of complaints is", total_march.sum())
In [29]:
may = df['2015-05']
In [30]:
# Per-type counts for May via the Series method (pd.value_counts is deprecated).
total_may = may['Complaint Type'].value_counts()
total_may
Out[30]:
In [31]:
print("The total number of complaints is", total_may.sum())
7). I'd like to see all of the 311 complaints called in on April 1st.
Surprise! We couldn't do this in class, but it was just a limitation of our data set
In [32]:
# All April rows, in chronological order.
April = df.loc['2015-04']
# DataFrame.sort() was removed in pandas 0.20. The index *is*
# created_datetime, so sort_index() gives the chronological order intended.
April.sort_index().head()
Out[32]:
In [33]:
April[:"20150401"]
Out[33]:
8.What was the most popular type of complaint on April 1st?
In [34]:
# Most popular complaint type on April 1st. Select the day exactly with .loc
# (the slice form depends on a sorted index) and use the Series value_counts
# method instead of the deprecated pd.value_counts.
april_complaints = April.loc["2015-04-01"]
april_complaints['Complaint Type'].value_counts().head(1)
Out[34]:
9.What were the most popular three types of complaint on April 1st
In [35]:
pd.value_counts(april_complaints['Complaint Type'],sort=True).head(3)
Out[35]:
10) What month has the most reports filed? How many? Graph it.
In [36]:
df.head(3)
Out[36]:
In [37]:
df.resample('M').count().sort_values('Created Date', ascending=False).head(1)
Out[37]:
In [38]:
# Monthly complaint totals, plotted via the per-month counts of the
# always-present Created Date column.
monthly_counts = df.resample('M').count()
ax = monthly_counts.plot(y='Created Date')
ax.set_title("Monthly Complaints")
Out[38]:
11) What week of the year has the most reports filed? How many? Graph the weekly complaints.
In [39]:
df.resample('W').count().sort_values('Created Date', ascending=False).head(1)
Out[39]:
In [40]:
# Weekly complaint totals over the year.
weekly_counts = df.resample('W').count()
ax = weekly_counts.plot(y='Created Date')
ax.set_title("Weekly Complaints")
Out[40]:
12) Noise complaints are a big deal. Use .str.contains
to select noise complaints, and make a chart of when they show up annually. Then make a chart of when they show up over the course of a day (cyclic).
In [41]:
# NOTE(review): despite the name, find_noice is NOT noise-only — it is the
# Complaint Type column for *all* rows; the noise filter is applied in the
# next cell. ("noice" is a typo for "noise", kept so later cells still run.)
columns_req = ['Complaint Type']
find_noice= df[columns_req]
find_noice.head()
Out[41]:
In [42]:
df_noice = find_noice[find_noice['Complaint Type'].str.contains("Noise", case=False)]
In [43]:
# Noise-only subset filtered above.
df_noice.head(3)
Out[43]:
In [44]:
# Daily noise-complaint counts; head(4) shows the first few days.
df_noice.resample('D').count().head(4)
Out[44]:
In [45]:
# Monthly noise-complaint counts across 2015.
ax = df_noice.resample('M').count().plot(y='Complaint Type')
ax.set_title("Noise Complaints in Year 2015")  # fixed typo: "Noice" -> "Noise"
Out[45]:
In [46]:
# Daily noise-complaint counts (the cyclic/daily view of the question).
ax = df_noice.resample('D').count().plot(y='Complaint Type')
ax.set_title("Daily Noise Complaints")  # fixed typo: "Noice" -> "Noise"
Out[46]:
13) Which were the top five days of the year for filing complaints? How many on each of those days? Graph it.
In [47]:
# Daily totals over *all* complaints (find_noice holds every complaint,
# despite its name), ranked from busiest day down.
daily_totals = find_noice.resample('D').count()
top_days = daily_totals.sort_values('Complaint Type', ascending=False)
top_days.head(5)
Out[47]:
In [48]:
# Copy the count into a clearly named column for plotting.
top_days['complaint count'] = top_days['Complaint Type']
In [49]:
# Keep only the renamed count column, restricted to the five busiest days.
t_days = top_days.drop('Complaint Type', axis=1).head(5)
In [50]:
# Bar chart of the five busiest days. The original passed x='t_days.index'
# as a literal string, which is not a valid column; a Series bar plot already
# uses its index (the dates) for the x axis.
ax = t_days['complaint count'].plot(kind='bar')
ax.set_title("Top Five Days with the Highest Number of Complaints")
Out[50]:
14) What hour of the day are the most complaints? Graph a day of complaints.
In [51]:
df['Unique Key'].groupby(by=df.index.hour).count()
Out[51]:
In [52]:
# Line plot of complaint volume across the 24 hours of a typical day.
hourly_counts = df['Unique Key'].groupby(by=df.index.hour).count()
ax = hourly_counts.plot()
ax.set_title("Complaint Flow in a Typical Day")
Out[52]:
15) One of the hours has an odd number of complaints. What are the most common complaints at that hour, and what are the most common complaints the hour before and after?
In [53]:
df.head(3)
Out[53]:
In [54]:
# Hour 0 is midnight (00:00-00:59); inspect the rows filed in that hour.
df[df.index.hour==0].head(3)
Out[54]:
In [55]:
# All complaints filed during the midnight hour (00:00-00:59).
most_comp = df[df.index.hour==0]
In [56]:
# Most common complaint types at midnight.
most_comp['Complaint Type'].value_counts().head(3)
Out[56]:
In [57]:
# The hour before midnight (23:00-23:59) for comparison.
most_comp_before = df[df.index.hour==23]
In [58]:
most_comp_before['Complaint Type'].value_counts().head(3)
Out[58]:
In [59]:
# The hour after midnight (01:00-01:59).
most_comp_after = df[df.index.hour==1]
In [60]:
most_comp_after['Complaint Type'].value_counts().head(3)
Out[60]:
16) So odd. What's the per-minute breakdown of complaints between 12am and 1am? You don't need to include 1am.
In [61]:
most_comp.head(3)
Out[61]:
In [62]:
# Q16: complaint counts per minute (0-59) within the midnight hour only —
# most_comp was already filtered to hour == 0 above.
most_comp['Complaint Type'].groupby(by=most_comp.index.minute).value_counts()
Out[62]:
17) Looks like midnight is a little bit of an outlier. Why might that be? Take the 5 most common agencies and graph the times they file reports at (all day, not just midnight).
In [63]:
df['Agency'].value_counts().head(5)
Out[63]:
In [87]:
agency = df[(df['Agency'] == 'NYPD') | (df['Agency'] == 'DOT') | (df['Agency'] == 'DPR') | (df['Agency'] == 'HPD') | (df['Agency'] == 'DOHMH')]
In [65]:
agency.head(3)
Out[65]:
In [88]:
# One sub-frame per agency, used for the separate per-agency plots below.
nypd=agency[(agency['Agency'] == 'NYPD')]
dot=agency[(agency['Agency'] == 'DOT')]
dpr=agency[(agency['Agency'] == 'DPR')]
hpd=agency[(agency['Agency'] == 'HPD')]
dohmh=agency[(agency['Agency'] == 'DOHMH')]
In [89]:
dohmh.head(3)
Out[89]:
In [94]:
# Hourly filing profile for each of the five busiest agencies, overlaid on a
# single Axes. A loop replaces five copy-pasted plot lines; passing ax=None on
# the first iteration creates the Axes, and each later call draws onto it.
ax = None
for label, frame in [('NYPD', nypd), ('DOT', dot), ('DPR', dpr),
                     ('HPD', hpd), ('DOHMH', dohmh)]:
    ax = frame.groupby(by=frame.index.hour).count().plot(y='Unique Key',
                                                         ax=ax, label=label)
ax.set_title("Complaints by Hour of Day for the Top 5 Agencies")
Out[94]:
In [ ]:
18) Graph those same agencies on an annual basis - make it weekly. When do people like to complain? When does the NYPD have an odd number of complaints?
In [95]:
# Weekly filing profile per agency across the year, overlaid on one Axes
# (loop replaces five copy-pasted plot lines; untitled figure fixed).
# NOTE(review): DatetimeIndex.week is deprecated in recent pandas;
# frame.index.isocalendar().week is the modern equivalent — confirm the
# pandas version in use before switching.
ax = None
for label, frame in [('NYPD', nypd), ('DOT', dot), ('DPR', dpr),
                     ('HPD', hpd), ('DOHMH', dohmh)]:
    ax = frame.groupby(by=frame.index.week).count().plot(y='Unique Key',
                                                         ax=ax, label=label)
ax.set_title("Weekly Complaints for the Top 5 Agencies")
Out[95]:
19) Maybe the NYPD deals with different issues at different times? Check the most popular complaints in July and August vs the month of May. Also check the most common complaints for the Housing Preservation Bureau (HPD) in winter vs. summer.
In [ ]:
# NYPD complaints in July and August combined.
july_aug = nypd[(nypd.index.month==7)| (nypd.index.month==8)]
In [ ]:
july_aug['Complaint Type'].value_counts().head(5)
In [ ]:
# NOTE(review): this rebinds 'may' — earlier it held all May complaints;
# from here on it is NYPD-only May complaints.
may = nypd[(nypd.index.month==5)]
In [ ]:
may['Complaint Type'].value_counts().head(5)
In [ ]:
# HPD complaints in the winter months (Dec, Jan, Feb)...
winter = hpd[(hpd.index.month==12)| (hpd.index.month==1)| (hpd.index.month==2)]
In [ ]:
winter['Complaint Type'].value_counts().head(5)
In [ ]:
# ...versus the summer months (Jun, Jul, Aug).
summer = hpd[(hpd.index.month==6)| (hpd.index.month==7)| (hpd.index.month==8)]
In [ ]:
summer['Complaint Type'].value_counts().head(5)
In [ ]: